Contents

%run set_theme.ipynb
import os

from plotly.offline import init_notebook_mode
import plotly.colors as pc
import plotly.express as px
import numpy as np
import pandas as pd
import numpy as np

init_notebook_mode()

# RECALCULATE forces the statistics to be rebuilt from the raw survey data even
# when a cached result exists; PRINT enables progress and summary output.
RECALCULATE = False
PRINT = False

# Build (or load from cache) one row per developer type with the mean salary,
# the respondent count and mean salary per gender, the relative pay gap (in
# percent of the overall mean salary) and the number of men per woman.
if not os.path.isfile('cache/gender_devtype.pq') or RECALCULATE:
    full_df = pd.read_parquet('../data/SO_2014_2022.pq')

    # NOTE(review): only the FIRST type listed by each respondent seeds this
    # list, so a type that is never listed first is not analysed -- confirm
    # that this is intended.
    devtypes = list(full_df['DevType'].dropna().apply(lambda s: s.split(';')[0]).unique())
    # Every respondent's DevType answer as a list of types (NaN when missing).
    dt_series = full_df.DevType.apply(lambda s: s.split(';') if isinstance(s, str) else np.nan)
    dt_mean_sal = pd.DataFrame(columns=['Salary'], index=devtypes)
    dt_gender_count = pd.DataFrame(columns=['Male', 'Female'], index=devtypes)
    dt_gender_sal = pd.DataFrame(columns=['Male', 'Female'], index=devtypes)


    def rows_with_devtype(frame, wanted):
        """Return the rows of *frame* whose DevType list contains *wanted*."""
        # astype(bool) keeps the mask boolean even when *frame* is empty.
        mask = frame.DevType.apply(lambda types: wanted in types).astype(bool)
        return frame[mask]


    # Drop incomplete rows once, so that every DevType entry below is a list
    # and every Salary / Gender value is present.
    search_mean_sal = pd.concat([dt_series, full_df.Salary], axis=1).dropna()
    search_gender_count = pd.concat([dt_series, full_df.Gender], axis=1).dropna()
    search_gender_sal = pd.concat([dt_series, full_df.Salary, full_df.Gender], axis=1).dropna()
    for progress, devtype in enumerate(devtypes):
        if PRINT:
            print(f'{progress / len(devtypes):>4.0%}')
        dt_mean_sal.loc[devtype, 'Salary'] = rows_with_devtype(search_mean_sal, devtype).Salary.mean()

        # Look the counts up by label: value_counts() orders by frequency, so
        # positional unpacking would swap the genders whenever women outnumber
        # men, and would break if a gender is absent for this type.
        gender_counts = rows_with_devtype(search_gender_count, devtype).Gender.value_counts()
        dt_gender_count.loc[devtype, 'Male'] = gender_counts.get('male', 0)
        dt_gender_count.loc[devtype, 'Female'] = gender_counts.get('female', 0)

        salary_rows = rows_with_devtype(search_gender_sal, devtype)
        for gender in ['Male', 'Female']:
            # Gender values in the data are lower case; the columns are not.
            of_gender = salary_rows.Gender == gender.lower()
            # A single .loc[row, col] assignment, because a chained
            # .loc[row][col] may write to a temporary copy.
            dt_gender_sal.loc[devtype, gender] = salary_rows.loc[of_gender, 'Salary'].mean()

    if PRINT:
        print('100%')

    # The helper frames were created empty and therefore hold generic objects;
    # cast to numeric dtypes so the fresh result behaves like the cached one.
    df = pd.DataFrame()
    df['Salary'] = dt_mean_sal['Salary'].astype('float64')
    df['MaleCount'] = dt_gender_count['Male'].astype('int64')
    df['FemaleCount'] = dt_gender_count['Female'].astype('int64')
    df['MaleSalary'] = dt_gender_sal['Male'].astype('float64')
    df['FemaleSalary'] = dt_gender_sal['Female'].astype('float64')
    # Positive gap: men earn more; expressed in percent of the mean salary.
    df['Gap'] = (df['MaleSalary'] - df['FemaleSalary']) / df['Salary'] * 100
    # Number of men per woman.
    df['Prop'] = df['MaleCount'] / df['FemaleCount']

    # Some positions might be unreliable due to low female respondent count
    df.query('MaleCount > 200 & FemaleCount > 200', inplace=True)

    # Writing fails if the cache directory is missing, so create it on demand.
    os.makedirs('cache', exist_ok=True)
    df.to_parquet('cache/gender_devtype.pq')
    RECALCULATE = False
else:
    df = pd.read_parquet('cache/gender_devtype.pq')
    if PRINT:
        print('Loaded from cache')
df.head()
Salary MaleCount FemaleCount MaleSalary FemaleSalary Gap Prop
Data scientist or machine learning specialist 66741.062337 27967 2135 67065.807725 65158.192406 2.858233 13.099297
Engineer, data 74134.871049 15341 801 73851.015016 75325.595661 -1.989051 19.152310
Developer, front-end 57962.401026 95441 7128 58357.289909 56208.281881 3.707590 13.389590
Student 24490.338723 34176 2740 24634.515929 24305.919094 1.341741 12.472993
Developer, full-stack 62865.299330 182772 9983 63024.234952 63244.182031 -0.349870 18.308324
# Optional textual summary of the extremes in the table.
# 'Gap' is positive when men earn more; 'Prop' is the number of men per woman,
# so a LOW 'Prop' means a HIGH share of women and vice versa.
if PRINT:
    print('Positions with most female favourable pay gap:', *df.sort_values(by='Gap').head(3).index.tolist(),
          sep='\n\t')
    print('\nPositions with most male favourable pay gap:',
          *df.sort_values(by='Gap', ascending=False).head(3).index.tolist(), sep='\n\t')
    # Fewest men per woman first -> highest female proportion.
    print('\nPositions with highest female proportion:',
          *df.sort_values(by='Prop').head(3).index.tolist(), sep='\n\t')
    # Most men per woman first -> lowest female proportion.
    print('\nPositions with lowest female proportion:',
          *df.sort_values(by='Prop', ascending=False).head(3).index.tolist(), sep='\n\t')
# Scatter plot of gender ratio (x) against pay gap (y), one dot per position,
# with an OLS trendline. Dots are coloured by gender ratio and sized by the
# square root of the male respondent count.
scatter_data = df.reset_index(names='Position')
axis_labels = dict(
    Prop='Average number of men per woman in position',
    Gap='Percentage higher payment of men',
)
ratio_colours = pc.make_colorscale(['#f854ee', '#4c75eb'])
dot_sizes = df['MaleCount'].apply(np.sqrt)

fig = px.scatter(
    scatter_data,
    x='Prop',
    y='Gap',
    color='Prop',
    size=dot_sizes,
    size_max=16,
    color_continuous_scale=ratio_colours,
    # Only the position name is carried along for the hover text.
    hover_data=dict(Position=True, Prop=False, Gap=False),
    labels=axis_labels,
    title=('Gender Ratio of Job Positions vs Pay Gap<br>'
           '<sup>Positions with more female developers seem to suffer from greater payment inequality</sup>'),
    trendline='ols',
    trendline_color_override='#343a42',
    width=790,
)

# Hide the colour bar and leave room below the plot area.
fig.update_layout(coloraxis_showscale=False, margin=dict(b=170, t=100, l=90))

# Give every trace a white hover label; the trendline trace gets an
# explanatory sentence, all other traces show the position name.
trendline_hover = 'At %{x:d} men per woman, the estimated pay gap would be %{y:.1f}%<extra></extra>'
position_hover = 'Position: %{customdata[0]}<extra></extra>'
for trace in fig.data:
    trace.hoverlabel = dict(font_color='white', bordercolor='white')
    is_trendline = 'trendline' in trace.hovertemplate
    trace.hovertemplate = trendline_hover if is_trendline else position_hover

# Explanatory caption, anchored in paper coordinates below the plot area.
caption_lines = [
    'Every position is displayed as a single dot, where larger dots indicate positions with more respondents. The x-axis',
    'shows the gender ratio, so dots towards the left (pink) have more women (a value of 10 means ten men per woman).',
    'The y-axis shows the pay gap, with positive values favouring men and negative values favouring women.',
    'Hover over a dot to see the corresponding position, or over the trendline to see the estimated pay gap.',
]
fig.add_annotation(
    text='<br>'.join(caption_lines),
    x=-0.09,
    y=-0.55,
    xref='paper',
    yref='paper',
    xanchor='left',
    yanchor='bottom',
    align='left',
    showarrow=False,
)

fig.show()